fix CI with emulated<256> and avx512f (#1326) — PR #1327
fix CI with emulated<256> and avx512f (#1326) — PR #1327. hdu-sdlzx wants to merge 1 commit into xtensor-stack:master from
Conversation
Signed-off-by: Liu Zixian <zixian.liu@dolphindb.com>
| } | ||
|
|
||
| template <class A> | ||
| template <class A, class = std::enable_if_t<batch<uint16_t, A>::size == 32>> |
There was a problem hiding this comment.
This seems redundant with requires_arch<avx512f> — why do you think this changes anything?
There was a problem hiding this comment.
The error occurs not in the function body, but in instantiation of the batch_constant parameter which is placed before requires_arch.
There was a problem hiding this comment.
Thanks for your answer! Actually this helped me better understand the issue, what do you think of the following, which looks more maintainable to me (and will be even cleaner once we move to C++17):
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 97427f19..cd249c46 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -2588,30 +2588,45 @@ namespace xsimd
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
};
- }
+ template<class A, uint16_t... Is>
+ constexpr bool is_reduce_pattern() {
+ // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
+ if(sizeof...(Is) != batch<uint16_t, A>::size) return false;
+ uint16_t pattern[] = {Is...};
+ if(pattern[0] != 1)
+ return false;
+ for(size_t i = 1; i < sizeof...(Is); i += 1) {
+ if(pattern[i] != (i & 1))
+ return false;
+ }
+ return true;
+ }
- template <class A, uint16_t... Idx, class = std::enable_if_t<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value>>
- XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
- {
- constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
- return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
}
- template <class A>
- XSIMD_INLINE batch<uint16_t, A>
- swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+ template <class A, uint16_t... Idx>
+ XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
{
- // FIXME: this sequence is very inefficient, but it's here to catch
- // a pattern generated by detail::reduce from xsimd_common_math.hpp.
- // The whole pattern is actually decently folded by GCC and Clang,
- // so bare with it.
- constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
- auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+ XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value) {
+ constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
+ return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+ }
+ else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern<A, Idx...>()) {
+ // FIXME: this sequence is very inefficient, but it's here to catch
+ // a pattern generated by detail::reduce from xsimd_common_math.hpp.
+ // The whole pattern is actually decently folded by GCC and Clang,
+ // so bare with it.
+ constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+ auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
- alignas(A::alignment()) uint16_t buffer[32];
- _mm512_store_si512((__m512i*)&buffer[0], tmp);
- buffer[0] = buffer[1];
- return _mm512_load_si512(&buffer[0]);
+ alignas(A::alignment()) uint16_t buffer[32];
+ _mm512_store_si512((__m512i*)&buffer[0], tmp);
+ buffer[0] = buffer[1];
+ return _mm512_load_si512(&buffer[0]);
+ }
+ else {
+ return swizzle(self, mask, common{});
+ }
}
template <class A, uint16_t... Vs>
There was a problem hiding this comment.
@hdu-sdlzx : I applied the patch on #1328 , adding the proper credit in the PR. comments are welcome!
There was a problem hiding this comment.
what do you think of the following, which looks more maintainable to me (and will be even cleaner once we move to C++17):
LGTM
I'm happy with a more readable patch as long as other PRs are not blocked by CI.
|
Obsoleted by #1328 |
No description provided.